import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
from matplotlib import pyplot as plt
%matplotlib inline
#html export
import plotly.io as pio
pio.renderers.default = 'notebook'
# load the raw car dataset (11,914 rows x 16 columns)
df = pd.read_csv('cardata.csv')
df
| Make | Model | Year | Engine Fuel Type | Engine HP | Engine Cylinders | Transmission Type | Driven_Wheels | Number of Doors | Market Category | Vehicle Size | Vehicle Style | highway MPG | city mpg | Popularity | MSRP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 46120 |
| 11910 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 56670 |
| 11911 | Acura | ZDX | 2012 | premium unleaded (required) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50620 |
| 11912 | Acura | ZDX | 2013 | premium unleaded (recommended) | 300.0 | 6.0 | AUTOMATIC | all wheel drive | 4.0 | Crossover,Hatchback,Luxury | Midsize | 4dr Hatchback | 23 | 16 | 204 | 50920 |
| 11913 | Lincoln | Zephyr | 2006 | regular unleaded | 221.0 | 6.0 | AUTOMATIC | front wheel drive | 4.0 | Luxury | Midsize | Sedan | 26 | 17 | 61 | 28995 |
11914 rows × 16 columns
df.columns  # inspect the raw (inconsistent) column names before cleaning
Index(['Make', 'Model', 'Year', 'Engine Fuel Type', 'Engine HP',
'Engine Cylinders', 'Transmission Type', 'Driven_Wheels',
'Number of Doors', 'Market Category', 'Vehicle Size', 'Vehicle Style',
'highway MPG', 'city mpg', 'Popularity', 'MSRP'],
dtype='object')
# make column titles consistent: lowercase, underscores instead of spaces
df.columns = [name.lower().replace(' ', '_') for name in df.columns]
df.head()
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
| 1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
| 2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
| 3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
| 4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
# getting columns with str (object) dtype
strings = [column for column, dtype in df.dtypes.items() if dtype == 'object']
strings
['make', 'model', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']
# ensuring string data has a standard format (lowercase, underscores)
df[strings] = df[strings].apply(lambda column: column.str.lower().str.replace(' ', '_'))
df.head()
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
Exploratory data analysis¶
# quick profile of every column: a few sample values plus cardinality
for column in df.columns:
    series = df[column]
    print(column)
    print(series.unique()[:5])  # first 5 unique values
    print(series.nunique())
    print()
make ['bmw' 'audi' 'fiat' 'mercedes-benz' 'chrysler'] 48 model ['1_series_m' '1_series' '100' '124_spider' '190-class'] 914 year [2011 2012 2013 1992 1993] 28 engine_fuel_type ['premium_unleaded_(required)' 'regular_unleaded' 'premium_unleaded_(recommended)' 'flex-fuel_(unleaded/e85)' 'diesel'] 10 engine_hp [335. 300. 230. 320. 172.] 356 engine_cylinders [ 6. 4. 5. 8. 12.] 9 transmission_type ['manual' 'automatic' 'automated_manual' 'direct_drive' 'unknown'] 5 driven_wheels ['rear_wheel_drive' 'front_wheel_drive' 'all_wheel_drive' 'four_wheel_drive'] 4 number_of_doors [ 2. 4. 3. nan] 3 market_category ['factory_tuner,luxury,high-performance' 'luxury,performance' 'luxury,high-performance' 'luxury' 'performance'] 71 vehicle_size ['compact' 'midsize' 'large'] 3 vehicle_style ['coupe' 'convertible' 'sedan' 'wagon' '4dr_hatchback'] 16 highway_mpg [26 28 27 25 24] 59 city_mpg [19 20 18 17 16] 69 popularity [3916 3105 819 617 1013] 48 msrp [46135 40650 36350 29450 34500] 6049
Distribution of price¶
sns.histplot(df['msrp'], bins = 50) # bins = number of bars in the histogram
<Axes: xlabel='msrp', ylabel='Count'>
px.histogram(df, x=df['msrp'])  # interactive version of the same distribution
# less expensive cars only (msrp < 100k) to zoom in past the long tail
sns.histplot(df.msrp[df['msrp'] < 100000], bins = 50)
<Axes: xlabel='msrp', ylabel='Count'>
# less expensive cars (msrp < 100k), interactive plotly version
px.histogram(df, x=df.msrp[df['msrp'] < 100000])
∴ The prices have a long-tail distribution: most cars are cheap and only a few are very expensive
This type of distribution is not good for ML, as the long tail will confuse our model
Hence we get rid of the tail by applying a logarithmic transformation to the price
# example of how the log transform compresses large values much more than small ones
np.log1p([1, 10, 1000, 100000])
array([ 0.69314718, 2.39789527, 6.90875478, 11.51293546])
# log-transform the target; log1p computes log(1 + x), safe for zero prices
price_logs = np.log1p(df['msrp'])
sns.histplot(price_logs)
<Axes: xlabel='msrp', ylabel='Count'>
When you use np.log1p (the natural logarithm of 1+𝑥), it compresses larger values more than smaller ones. This transformation reduces the gap between extremely high and moderate prices, effectively pulling the long tail back into a roughly normal (bell-shaped) distribution
Missing values¶
df.info()  # dtypes and non-null counts per column
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11914 entries, 0 to 11913 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 make 11914 non-null object 1 model 11914 non-null object 2 year 11914 non-null int64 3 engine_fuel_type 11911 non-null object 4 engine_hp 11845 non-null float64 5 engine_cylinders 11884 non-null float64 6 transmission_type 11914 non-null object 7 driven_wheels 11914 non-null object 8 number_of_doors 11908 non-null float64 9 market_category 8172 non-null object 10 vehicle_size 11914 non-null object 11 vehicle_style 11914 non-null object 12 highway_mpg 11914 non-null int64 13 city_mpg 11914 non-null int64 14 popularity 11914 non-null int64 15 msrp 11914 non-null int64 dtypes: float64(3), int64(5), object(8) memory usage: 1.5+ MB
df.isna().sum()  # count of missing values per column
make 0 model 0 year 0 engine_fuel_type 3 engine_hp 69 engine_cylinders 30 transmission_type 0 driven_wheels 0 number_of_doors 6 market_category 3742 vehicle_size 0 vehicle_style 0 highway_mpg 0 city_mpg 0 popularity 0 msrp 0 dtype: int64
Setting up a validation framework¶
# total number of records in the entire dataset
n = len(df)
n
11914
# dividing dataset into 20% Validate, 20% Test, 60% Train
# NOTE: int() truncates, so the three sizes may not add up to n (checked below)
n_val = int(len(df) * 0.2)
print(n_val)
print()
n_test = int(len(df) * 0.2)
print(n_test)
print()
n_train = int(len(df) * 0.6)
print(n_train)
print()
2382 2382 7148
n , n_val + n_test + n_train  # 2 records are lost to int() truncation
(11914, 11912)
# to ensure all records are used: take the train size as the remainder
n_val = int(len(df) * 0.2)
print(n_val)
n_test = int(len(df) * 0.2)
print(n_test)
n_train = n - n_val - n_test  # remainder absorbs the truncation leftovers
print(n_train)
2382 2382 7150
(n , n_val + n_test + n_train)  # now all 11914 records are accounted for
(11914, 11914)
# subdividing the data (sequential slices -- replaced by a shuffled split below)
df_val = df[:n_val]
df_val
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2377 | porsche | cayenne | 2016 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 157300 |
| 2378 | porsche | cayenne | 2016 | diesel | 240.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,diesel | midsize | 4dr_suv | 29 | 20 | 1715 | 62300 |
| 2379 | porsche | cayenne | 2017 | premium_unleaded_(required) | 520.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 21 | 14 | 1715 | 116500 |
| 2380 | porsche | cayenne | 2017 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury | midsize | 4dr_suv | 24 | 18 | 1715 | 59600 |
| 2381 | porsche | cayenne | 2017 | premium_unleaded_(required) | 440.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 23 | 16 | 1715 | 97200 |
2382 rows × 16 columns
df_test = df[n_val:n_val + n_test]  # next 20% after the validation slice
df_test
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2382 | porsche | cayenne | 2017 | premium_unleaded_(required) | 570.0 | 8.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,high-performance | midsize | 4dr_suv | 21 | 14 | 1715 | 159600 |
| 2383 | porsche | cayenne | 2017 | premium_unleaded_(required) | 420.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,luxury,performance | midsize | 4dr_suv | 24 | 17 | 1715 | 76200 |
| 2384 | porsche | cayman_s | 2006 | premium_unleaded_(required) | 295.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 26 | 18 | 1715 | 58900 |
| 2385 | porsche | cayman | 2014 | premium_unleaded_(required) | 275.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 30 | 20 | 1715 | 52600 |
| 2386 | porsche | cayman | 2014 | premium_unleaded_(required) | 325.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 1715 | 63800 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4759 | ford | flex | 2016 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 42600 |
| 4760 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32300 |
| 4761 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | large | wagon | 23 | 16 | 5657 | 29600 |
| 4762 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 34250 |
| 4763 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 39750 |
2382 rows × 16 columns
df_train = df[n_val + n_test:]  # remaining 60% of the records
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4764 | ford | flex | 2016 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 37800 |
| 4765 | ford | flex | 2017 | premium_unleaded_(recommended) | 365.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover | large | wagon | 21 | 15 | 5657 | 43030 |
| 4766 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,performance | large | wagon | 22 | 16 | 5657 | 40180 |
| 4767 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 32730 |
| 4768 | ford | flex | 2017 | regular_unleaded | 287.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover,performance | large | wagon | 23 | 16 | 5657 | 38230 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11909 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 46120 |
| 11910 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 56670 |
| 11911 | acura | zdx | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50620 |
| 11912 | acura | zdx | 2013 | premium_unleaded_(recommended) | 300.0 | 6.0 | automatic | all_wheel_drive | 4.0 | crossover,hatchback,luxury | midsize | 4dr_hatchback | 23 | 16 | 204 | 50920 |
| 11913 | lincoln | zephyr | 2006 | regular_unleaded | 221.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 26 | 17 | 61 | 28995 |
7150 rows × 16 columns
The data is sequential (sorted by make), and we don't want that ordering when training our model
# shuffling the records: build a positional index array 0..n-1 to permute
np.arange(n)
array([ 0, 1, 2, ..., 11911, 11912, 11913])
idx = np.arange(n)  # index array that will be shuffled in place
df.iloc[idx[:10]]   # before shuffling: still the original order
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | bmw | 1_series_m | 2011 | premium_unleaded_(required) | 335.0 | 6.0 | manual | rear_wheel_drive | 2.0 | factory_tuner,luxury,high-performance | compact | coupe | 26 | 19 | 3916 | 46135 |
| 1 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 28 | 19 | 3916 | 40650 |
| 2 | bmw | 1_series | 2011 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 36350 |
| 3 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 29450 |
| 4 | bmw | 1_series | 2011 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 34500 |
| 5 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | coupe | 28 | 18 | 3916 | 31200 |
| 6 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,performance | compact | convertible | 26 | 17 | 3916 | 44100 |
| 7 | bmw | 1_series | 2012 | premium_unleaded_(required) | 300.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury,high-performance | compact | coupe | 28 | 20 | 3916 | 39300 |
| 8 | bmw | 1_series | 2012 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 28 | 18 | 3916 | 36900 |
| 9 | bmw | 1_series | 2013 | premium_unleaded_(required) | 230.0 | 6.0 | manual | rear_wheel_drive | 2.0 | luxury | compact | convertible | 27 | 18 | 3916 | 37200 |
np.random.seed(2) # fix the seed so the shuffle is reproducible across runs
np.random.shuffle(idx)  # in-place permutation of the index array
print(idx)
[2735 6720 5878 ... 6637 2575 7336]
# confirming the dataset is shuffled via the permuted index
df.iloc[idx[:10]] # first 10 records of shuffled dataset
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| 8001 | volkswagen | rabbit | 2008 | regular_unleaded | 170.0 | 5.0 | manual | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 29 | 22 | 873 | 17575 |
| 2882 | bentley | continental_gtc | 2013 | premium_unleaded_(required) | 500.0 | 8.0 | automatic | all_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 24 | 14 | 520 | 191400 |
| 649 | bmw | 6_series | 2015 | premium_unleaded_(required) | 315.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | coupe | 32 | 21 | 3916 | 76100 |
| 616 | maybach | 57 | 2012 | premium_unleaded_(required) | 543.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury | large | sedan | 16 | 10 | 67 | 379050 |
| 4459 | ford | f-150_heritage | 2004 | regular_unleaded | 202.0 | 6.0 | manual | four_wheel_drive | 2.0 | NaN | large | regular_cab_pickup | 18 | 13 | 5657 | 26030 |
# indexes of the last 40% of the dataset (validation + test portions)
idx[n_train:]
array([2779, 3708, 4794, ..., 6637, 2575, 7336])
df_train = df.iloc[idx[:n_train]] # first 60% of the shuffled order is for training
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2735 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 6720 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 5878 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 11190 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4554 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 434 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 1902 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 9334 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 5284 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 2420 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
df_val = df.iloc[idx[n_train:n_train + n_val]]  # next 20%: validation
df_test = df.iloc[idx[n_train + n_val:]]  # final 20%: test
df_val
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2779 | chevrolet | colorado | 2015 | regular_unleaded | 200.0 | 4.0 | automatic | four_wheel_drive | 4.0 | NaN | compact | extended_cab_pickup | 25 | 19 | 1385 | 26885 |
| 3708 | mercedes-benz | e-class | 2017 | premium_unleaded_(required) | 241.0 | 4.0 | automatic | all_wheel_drive | 4.0 | luxury | midsize | sedan | 29 | 22 | 617 | 54650 |
| 4794 | ford | focus | 2017 | flex-fuel_(unleaded/e85) | 160.0 | 4.0 | manual | front_wheel_drive | 4.0 | flex_fuel | compact | sedan | 36 | 26 | 5657 | 16775 |
| 10498 | acura | tlx | 2016 | premium_unleaded_(recommended) | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | sedan | 34 | 21 | 204 | 42600 |
| 1880 | volkswagen | beetle_convertible | 2016 | regular_unleaded | 170.0 | 4.0 | automatic | front_wheel_drive | 2.0 | NaN | compact | convertible | 34 | 25 | 873 | 25995 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11123 | volvo | v60 | 2015 | regular_unleaded | 240.0 | 4.0 | automatic | front_wheel_drive | 4.0 | luxury | midsize | wagon | 37 | 25 | 870 | 35750 |
| 5549 | maserati | granturismo_convertible | 2015 | premium_unleaded_(required) | 444.0 | 8.0 | automatic | rear_wheel_drive | 2.0 | exotic,luxury,high-performance | midsize | convertible | 20 | 13 | 238 | 145740 |
| 4146 | cadillac | escalade_hybrid | 2013 | regular_unleaded | 332.0 | 8.0 | automatic | rear_wheel_drive | 4.0 | luxury,hybrid | large | 4dr_suv | 23 | 20 | 1624 | 74425 |
| 6337 | mitsubishi | lancer | 2016 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 4.0 | NaN | compact | sedan | 34 | 24 | 436 | 17595 |
| 9814 | kia | sorento | 2015 | regular_unleaded | 290.0 | 6.0 | automatic | front_wheel_drive | 4.0 | crossover | midsize | 4dr_suv | 25 | 18 | 1720 | 26700 |
2382 rows × 16 columns
# resetting the index of each split so row numbers start again from 0
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | msrp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 | 14410 |
| 1 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 | 19685 |
| 2 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 | 19795 |
| 3 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 | 2000 |
| 4 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 | 56260 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7145 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 | 54900 |
| 7146 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 | 29215 |
| 7147 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 | 34675 |
| 7148 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 | 303300 |
| 7149 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 | 37820 |
7150 rows × 16 columns
df_train['msrp'].values  # raw target as a numpy array
array([ 14410, 19685, 19795, ..., 34675, 303300, 37820], dtype=int64)
np.log1p(df_train['msrp'].values)  # log-transformed target
array([ 9.57574708, 9.887663 , 9.89323518, ..., 10.45380308,
12.62248099, 10.54061978])
# log-transform the target for each split
y_train = np.log1p(df_train['msrp'].values)
y_val = np.log1p(df_val['msrp'].values)
y_test = np.log1p(df_test['msrp'].values)
# delete msrp column to avoid accidentally using the target as a feature
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']
len(y_train)
7150
Linear Regression¶
df_train.iloc[10]  # one example record to build a toy prediction for
make rolls-royce model phantom_drophead_coupe year 2015 engine_fuel_type premium_unleaded_(required) engine_hp 453.0 engine_cylinders 12.0 transmission_type automatic driven_wheels rear_wheel_drive number_of_doors 2.0 market_category exotic,luxury,performance vehicle_size large vehicle_style convertible highway_mpg 19 city_mpg 11 popularity 86 Name: 10, dtype: object
# taking engine_hp, city_mpg, popularity as our features for one record
xi = df_train.loc[10, ['engine_hp', 'city_mpg', 'popularity']]
xi
engine_hp 453.0 city_mpg 11 popularity 86 Name: 10, dtype: object
xi = xi.values  # plain numpy array of the three feature values
xi
array([453.0, 11, 86], dtype=object)
The linear model:
$$ g(X_i) = W_0 + W_1 X_{i1} + W_2 X_{i2} + W_3 X_{i3} $$
w0 = 7.17 # bias term - prediction we make w/o knowing anything about the car
w = [0.01, 0.04, 0.002] # weight for each feature


def linear_regression(xi):
    """Predict log1p(price) for one feature vector xi using bias w0 and weights w."""
    prediction = w0
    for j, feature in enumerate(xi):
        prediction += w[j] * feature
    return prediction
linear_regression(xi)
12.312
# undoing the log we applied; expm1 is the inverse of log1p
np.expm1(12.312)
222347.2221101062
np.log1p(222347.2221101062)  # round-trips back to the prediction
12.312
Linear Regression Vector form¶
# dot product for features and weights
def dot(xi, w):
    """Return the dot product of the feature vector xi and weight vector w."""
    total = 0.0
    for j, feature in enumerate(xi):
        total += feature * w[j]
    return total


def linear_regression(xi):
    """Predict with the bias w0 plus the dot product of features and weights."""
    return w0 + dot(xi, w)
performing dot product :
$$ g(X_i) = W_0 + (X_i)^T \cdot W $$
Dot product between the transposed feature vector and the weight vector: $$ (X_i)^T \cdot W = X_{i1}W_1 + X_{i2}W_2 + X_{i3}W_3 + \dots + X_{in}W_n $$
Bringing in W0 into the dot product:
$$ g(X_i) = W_0 \cdot X_{i0}+ (X_i)^T \cdot W $$
Where Xi0 = 1
Hence:
W = [W0 W1 W2 ...Wn ]
Xi = [Xi0 Xi1 Xi2...Xin]
∴ $$ g(X_i) = W^T \cdot X_i = (X_i)^T \cdot W $$
w_new = [w0] + w  # prepend the bias so it participates in the dot product
[1] + [1, 2, 3]  # list concatenation example of how [w0] + w behaves
[1, 1, 2, 3]
w_new # acts as the full weight vector W in our equation (bias included)
[7.17, 0.01, 0.04, 0.002]
def linear_regression(xi):
    """Predict via a single dot product with the bias-augmented weight vector w_new."""
    # prepend the constant 1 at index 0 to act as Xi0, since xi is an array
    augmented = np.insert(xi, 0, 1)
    return dot(augmented, w_new)
linear_regression(xi)
12.312
Linear regression is same even in vector form
# implementing this on a matrix: one feature row per car, each with a leading 1 (Xi0)
x1 = [1, 148, 24, 1385]
x2 = [1, 132, 24, 1385]
x10 = [1, 453, 11, 86]
X = [x1, x2, x10]
X = np.array(X)
X
array([[ 1, 148, 24, 1385],
[ 1, 132, 24, 1385],
[ 1, 453, 11, 86]])
w_new  # full weight vector, bias first
[7.17, 0.01, 0.04, 0.002]
def linear_regression(X):
    # matrix-vector product: one prediction per row of X
    return X.dot(w_new)
linear_regression(X)
array([12.38 , 12.22 , 12.312])
Training a linear regression model¶
In our LR we have :
$$ g(X) = (X)^T \cdot W $$
Hence:
$$ X \cdot W = y $$
In our equation, y can only be approximated (i.e. XW $\approx$ y), as an exact solution generally does not exist
To get predictions closest to y we need to solve the system for the weight vector W
If X were invertible we could solve directly (note the inverse multiplies from the left):
$$ X \cdot W \approx y $$
$$ X^{-1} \cdot X \cdot W \approx X^{-1} \cdot y $$
$$ I \cdot W \approx X^{-1} \cdot y $$
$$ W \approx X^{-1} \cdot y $$
But X is not a square matrix, so getting its inverse is impossible.
So we need to find a way to convert it to a square matrix
∴ $$ X \cdot W \approx y $$
$$ X^T \cdot X \cdot W \approx X^T \cdot y $$
XT X is a Gram matrix - a square matrix built from X
Hence we multiply by its inverse (again from the left) to isolate our W
$$ (X^T \cdot X)^{-1} \cdot (X^T \cdot X) \cdot W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$
$$ I \cdot W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$
$$ W \approx (X^T \cdot X)^{-1} \cdot X^T \cdot y $$
def train_linear_regression():
    """Placeholder — the actual training routine is implemented further below."""
    return None
X = [
[148, 24, 1385],
[132, 25, 2031],
[453, 11, 86],
[158, 24, 185],
[172, 25, 201],
[413, 11, 86],
[38, 54, 185],
[142, 25, 431],
[453, 31, 86],
]
X = np.array(X)
X
array([[ 148, 24, 1385],
[ 132, 25, 2031],
[ 453, 11, 86],
[ 158, 24, 185],
[ 172, 25, 201],
[ 413, 11, 86],
[ 38, 54, 185],
[ 142, 25, 431],
[ 453, 31, 86]])
XTX = X.T.dot(X)
XTX
array([[ 696471, 44115, 718540],
[ 44115, 7146, 118803],
[ 718540, 118803, 6359986]])
XTX_inv = np.linalg.inv(XTX)
XTX_inv.round()
array([[ 0., -0., 0.],
[-0., 0., -0.],
[ 0., -0., 0.]])
XTX.dot(XTX_inv).round() # proof exists
array([[ 1., 0., 0.],
[-0., 1., 0.],
[ 0., 0., 1.]])
y = [100, 200, 150, 250, 100, 200, 150, 250, 120 ]
XTX_inv.dot(X.T).dot(y)
array([0.26190562, 3.06101252, 0.03696909])
# adding bias term
X.shape
(9, 3)
ones = np.ones(X.shape[0])
X = np.column_stack([ones, X])
X
array([[1.000e+00, 1.480e+02, 2.400e+01, 1.385e+03],
[1.000e+00, 1.320e+02, 2.500e+01, 2.031e+03],
[1.000e+00, 4.530e+02, 1.100e+01, 8.600e+01],
[1.000e+00, 1.580e+02, 2.400e+01, 1.850e+02],
[1.000e+00, 1.720e+02, 2.500e+01, 2.010e+02],
[1.000e+00, 4.130e+02, 1.100e+01, 8.600e+01],
[1.000e+00, 3.800e+01, 5.400e+01, 1.850e+02],
[1.000e+00, 1.420e+02, 2.500e+01, 4.310e+02],
[1.000e+00, 4.530e+02, 3.100e+01, 8.600e+01]])
XTX = X.T.dot(X)
XTX
array([[9.000000e+00, 2.109000e+03, 2.300000e+02, 4.676000e+03],
[2.109000e+03, 6.964710e+05, 4.411500e+04, 7.185400e+05],
[2.300000e+02, 4.411500e+04, 7.146000e+03, 1.188030e+05],
[4.676000e+03, 7.185400e+05, 1.188030e+05, 6.359986e+06]])
XTX_inv = np.linalg.inv(XTX)
XTX_inv
array([[ 3.30686958e+00, -5.39612291e-03, -6.21325581e-02,
-6.61016816e-04],
[-5.39612291e-03, 1.11633857e-05, 8.66973393e-05,
1.08664195e-06],
[-6.21325581e-02, 8.66973393e-05, 1.46189255e-03,
8.57849603e-06],
[-6.61016816e-04, 1.08664195e-06, 8.57849603e-06,
3.60215866e-07]])
w_full = XTX_inv.dot(X.T).dot(y)
w_full
array([ 3.00067767e+02, -2.27742529e-01, -2.57694130e+00, -2.30120640e-02])
w0 = w_full[0]
w = w_full[1:]
# coefficients for linear regression
w0, w
(300.0677669255554, array([-0.22774253, -2.5769413 , -0.02301206]))
def train_linear_regression(X, y):
    """Fit ordinary least squares via the normal equation.

    X: (n_samples, n_features) feature matrix, without a bias column.
    y: (n_samples,) target vector.
    Returns (w0, w): the bias term and the feature weight vector.
    """
    # Prepend a column of ones so the bias w0 is learned together with the weights.
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    # Solve the normal equations (X^T X) w = X^T y directly: numerically
    # more stable than forming the explicit inverse of X^T X.
    w_full = np.linalg.solve(XTX, X.T.dot(y))
    return w_full[0], w_full[1:]
X = [
[148, 24, 1385],
[132, 25, 2031],
[453, 11, 86],
[158, 24, 185],
[172, 25, 201],
[413, 11, 86],
[38, 54, 185],
[142, 25, 431],
[453, 31, 86],
]
X = np.array(X)
X
array([[ 148, 24, 1385],
[ 132, 25, 2031],
[ 453, 11, 86],
[ 158, 24, 185],
[ 172, 25, 201],
[ 413, 11, 86],
[ 38, 54, 185],
[ 142, 25, 431],
[ 453, 31, 86]])
train_linear_regression(X, y)
(300.0677669255554, array([-0.22774253, -2.5769413 , -0.02301206]))
Car price baseline model¶
df_train.dtypes
make object model object year int64 engine_fuel_type object engine_hp float64 engine_cylinders float64 transmission_type object driven_wheels object number_of_doors float64 market_category object vehicle_size object vehicle_style object highway_mpg int64 city_mpg int64 popularity int64 dtype: object
# using numerical columns
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
df_train[base]
| engine_hp | engine_cylinders | highway_mpg | city_mpg | popularity | |
|---|---|---|---|---|---|
| 0 | 148.0 | 4.0 | 33 | 24 | 1385 |
| 1 | 132.0 | 4.0 | 32 | 25 | 2031 |
| 2 | 148.0 | 4.0 | 37 | 28 | 640 |
| 3 | 90.0 | 4.0 | 18 | 16 | 873 |
| 4 | 385.0 | 8.0 | 21 | 15 | 5657 |
| ... | ... | ... | ... | ... | ... |
| 7145 | 300.0 | 6.0 | 31 | 20 | 3916 |
| 7146 | 210.0 | 4.0 | 30 | 24 | 873 |
| 7147 | 285.0 | 6.0 | 22 | 17 | 549 |
| 7148 | 563.0 | 12.0 | 21 | 13 | 86 |
| 7149 | 200.0 | 4.0 | 31 | 22 | 873 |
7150 rows × 5 columns
# to extract the values into a numpy array
X_train = df_train[base].values
X_train
array([[ 148., 4., 33., 24., 1385.],
[ 132., 4., 32., 25., 2031.],
[ 148., 4., 37., 28., 640.],
...,
[ 285., 6., 22., 17., 549.],
[ 563., 12., 21., 13., 86.],
[ 200., 4., 31., 22., 873.]])
y_train
array([ 9.57574708, 9.887663 , 9.89323518, ..., 10.45380308,
12.62248099, 10.54061978])
df_train[base].isna().sum()
engine_hp 40 engine_cylinders 14 highway_mpg 0 city_mpg 0 popularity 0 dtype: int64
# making model ignore these null features
X_train = df_train[base].fillna(0).values
# getting our weights
w0, w = train_linear_regression(X_train, y_train)
print(len(w))
print(X_train.shape)
5 (7150, 5)
y_pred = w0 + X_train.dot(w)
y_pred
array([ 9.54792783, 9.38733977, 9.67197758, ..., 10.30423015,
11.9778914 , 9.99863111])
sns.histplot(y_pred, color='red', alpha=0.5) # predictions
sns.histplot(y_train, color='blue', alpha=0.5) # target variables
<Axes: ylabel='Count'>
The prediction distribution sits below the target distribution, so the model systematically underestimates prices — it is not yet ideal
RMSE¶
Quantifying how well the model performs
def rmse(y, y_pred):
    """Root mean squared error between targets y and predictions y_pred."""
    squared_errors = (y - y_pred) ** 2
    return np.sqrt(squared_errors.mean())
rmse(y_train, y_pred)
0.7554192603920132
Validating the model¶
df_val[base]
| engine_hp | engine_cylinders | highway_mpg | city_mpg | popularity | |
|---|---|---|---|---|---|
| 0 | 200.0 | 4.0 | 25 | 19 | 1385 |
| 1 | 241.0 | 4.0 | 29 | 22 | 617 |
| 2 | 160.0 | 4.0 | 36 | 26 | 5657 |
| 3 | 290.0 | 6.0 | 34 | 21 | 204 |
| 4 | 170.0 | 4.0 | 34 | 25 | 873 |
| ... | ... | ... | ... | ... | ... |
| 2377 | 240.0 | 4.0 | 37 | 25 | 870 |
| 2378 | 444.0 | 8.0 | 20 | 13 | 238 |
| 2379 | 332.0 | 8.0 | 23 | 20 | 1624 |
| 2380 | 148.0 | 4.0 | 34 | 24 | 436 |
| 2381 | 290.0 | 6.0 | 25 | 18 | 1720 |
2382 rows × 5 columns
# getting our feature matrix X
def prepare_x(df):
    """Build the feature matrix from the base numeric columns.

    Missing values are filled with 0 so they contribute nothing to the
    prediction. Returns a (n_samples, len(base)) numpy array.
    """
    # FIX: the original assigned df[base] to df_num and immediately
    # overwrote it — the redundant first assignment is removed.
    df_num = df[base].fillna(0)
    X = df_num.values
    return X
# training part
X_train = prepare_x(df_train) # extract clean X matrix
w0, w = train_linear_regression(X_train, y_train) # gets weights for features of x
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.761653099130156
Feature Engineering¶
df_train[base]
| engine_hp | engine_cylinders | highway_mpg | city_mpg | popularity | |
|---|---|---|---|---|---|
| 0 | 148.0 | 4.0 | 33 | 24 | 1385 |
| 1 | 132.0 | 4.0 | 32 | 25 | 2031 |
| 2 | 148.0 | 4.0 | 37 | 28 | 640 |
| 3 | 90.0 | 4.0 | 18 | 16 | 873 |
| 4 | 385.0 | 8.0 | 21 | 15 | 5657 |
| ... | ... | ... | ... | ... | ... |
| 7145 | 300.0 | 6.0 | 31 | 20 | 3916 |
| 7146 | 210.0 | 4.0 | 30 | 24 | 873 |
| 7147 | 285.0 | 6.0 | 22 | 17 | 549 |
| 7148 | 563.0 | 12.0 | 21 | 13 | 86 |
| 7149 | 200.0 | 4.0 | 31 | 22 | 873 |
7150 rows × 5 columns
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 |
| 1 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 |
| 2 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 |
| 3 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 |
| 4 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7145 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 |
| 7146 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 |
| 7147 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 |
| 7148 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 |
| 7149 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 |
7150 rows × 15 columns
Let's try adding more features to our feature matrix X to see how the RMSE behaves; we are currently at RMSE 0.761653099130156 on the validation dataset
df_train.year.max()
2017
# adding age to our feature matrix
def prepare_x(df):
    """Feature matrix: base numeric columns plus the car's age in years."""
    df = df.copy()  # work on a copy so the caller's dataframe is untouched
    # Age relative to the newest model year present in the dataframe.
    df['age'] = df.year.max() - df.year
    df_num = df[base + ['age']].fillna(0)
    return df_num.values
X_train = prepare_x(df_train)
X_train
array([[1.480e+02, 4.000e+00, 3.300e+01, 2.400e+01, 1.385e+03, 9.000e+00],
[1.320e+02, 4.000e+00, 3.200e+01, 2.500e+01, 2.031e+03, 5.000e+00],
[1.480e+02, 4.000e+00, 3.700e+01, 2.800e+01, 6.400e+02, 1.000e+00],
...,
[2.850e+02, 6.000e+00, 2.200e+01, 1.700e+01, 5.490e+02, 2.000e+00],
[5.630e+02, 1.200e+01, 2.100e+01, 1.300e+01, 8.600e+01, 3.000e+00],
[2.000e+02, 4.000e+00, 3.100e+01, 2.200e+01, 8.730e+02, 0.000e+00]])
# training part
# X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.5172055461058299
The lower RMSE means the model’s predictions are closer to the actual values, indicating better model performance.
sns.histplot(y_pred, color='red', alpha=0.5, bins=50) # predictions
sns.histplot(y_val, color='blue', alpha=0.5, bins=50) # target variables
<Axes: ylabel='Count'>
Categorical Values¶
df_train
| make | model | year | engine_fuel_type | engine_hp | engine_cylinders | transmission_type | driven_wheels | number_of_doors | market_category | vehicle_size | vehicle_style | highway_mpg | city_mpg | popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chevrolet | cobalt | 2008 | regular_unleaded | 148.0 | 4.0 | manual | front_wheel_drive | 2.0 | NaN | compact | coupe | 33 | 24 | 1385 |
| 1 | toyota | matrix | 2012 | regular_unleaded | 132.0 | 4.0 | automatic | front_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 32 | 25 | 2031 |
| 2 | subaru | impreza | 2016 | regular_unleaded | 148.0 | 4.0 | automatic | all_wheel_drive | 4.0 | hatchback | compact | 4dr_hatchback | 37 | 28 | 640 |
| 3 | volkswagen | vanagon | 1991 | regular_unleaded | 90.0 | 4.0 | manual | rear_wheel_drive | 3.0 | NaN | large | passenger_minivan | 18 | 16 | 873 |
| 4 | ford | f-150 | 2017 | flex-fuel_(unleaded/e85) | 385.0 | 8.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | crew_cab_pickup | 21 | 15 | 5657 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7145 | bmw | 4_series | 2015 | premium_unleaded_(required) | 300.0 | 6.0 | automatic | rear_wheel_drive | 2.0 | luxury,performance | midsize | convertible | 31 | 20 | 3916 |
| 7146 | volkswagen | beetle | 2015 | premium_unleaded_(recommended) | 210.0 | 4.0 | automated_manual | front_wheel_drive | 2.0 | hatchback,performance | compact | 2dr_hatchback | 30 | 24 | 873 |
| 7147 | gmc | sierra_1500 | 2015 | flex-fuel_(unleaded/e85) | 285.0 | 6.0 | automatic | four_wheel_drive | 4.0 | flex_fuel | large | extended_cab_pickup | 22 | 17 | 549 |
| 7148 | rolls-royce | ghost | 2014 | premium_unleaded_(required) | 563.0 | 12.0 | automatic | rear_wheel_drive | 4.0 | exotic,luxury,performance | large | sedan | 21 | 13 | 86 |
| 7149 | volkswagen | cc | 2017 | premium_unleaded_(recommended) | 200.0 | 4.0 | automated_manual | front_wheel_drive | 4.0 | performance | midsize | sedan | 31 | 22 | 873 |
7150 rows × 15 columns
df_train.dtypes[df_train.dtypes == 'object']
make object model object engine_fuel_type object transmission_type object driven_wheels object market_category object vehicle_size object vehicle_style object dtype: object
df_train['number_of_doors'].unique()
array([ 2., 4., 3., nan])
# adding number of doors to our X feature matrix
def prepare_x(df):
    """Feature matrix: base numerics, age, and binary door-count indicators."""
    df = df.copy()  # keep the caller's dataframe unmodified
    features = base.copy()
    df['age'] = df.year.max() - df.year
    features.append('age')
    # One binary indicator column per observed door count (2, 3 or 4).
    for doors in [2, 3, 4]:
        col = 'num_doors_%s' % doors
        df[col] = (df['number_of_doors'] == doors).astype('int')
        features.append(col)
    return df[features].fillna(0).values
# training part
X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.5157995641501689
∴ Adding no. of doors into our feature matrix slightly improves our model
makes = list(df['make'].value_counts().head(10).index)
makes
['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge']
# adjusting our feature matrix M
def prepare_x(df):
    """Feature matrix: base numerics, age, door-count dummies, top-make dummies."""
    df = df.copy()  # keep the caller's dataframe unmodified
    features = base.copy()
    df['age'] = df.year.max() - df.year
    features.append('age')
    # One binary indicator column per observed door count.
    for doors in [2, 3, 4]:
        col = 'num_doors_%s' % doors
        df[col] = (df['number_of_doors'] == doors).astype('int')
        features.append(col)
    # One binary indicator column per popular make (global `makes` list).
    for make in makes:
        col = 'make_%s' % make
        df[col] = (df['make'] == make).astype('int')
        features.append(col)
    return df[features].fillna(0).values
# training part
X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.5076038849556795
∴ Adding most popular make into our feature matrix slightly improves our model
df_train.dtypes
make object model object year int64 engine_fuel_type object engine_hp float64 engine_cylinders float64 transmission_type object driven_wheels object number_of_doors float64 market_category object vehicle_size object vehicle_style object highway_mpg int64 city_mpg int64 popularity int64 dtype: object
categorical_variables = ['make', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style', ]
# Inspect each categorical column: how many distinct values it has,
# and which five values occur most often (candidates for one-hot encoding).
for x in categorical_variables:
    print(df[x].nunique())
    print(df[x].value_counts().head())
    print("")
48 make chevrolet 1123 ford 881 volkswagen 809 toyota 746 dodge 626 Name: count, dtype: int64 10 engine_fuel_type regular_unleaded 7172 premium_unleaded_(required) 2009 premium_unleaded_(recommended) 1523 flex-fuel_(unleaded/e85) 899 diesel 154 Name: count, dtype: int64 5 transmission_type automatic 8266 manual 2935 automated_manual 626 direct_drive 68 unknown 19 Name: count, dtype: int64 4 driven_wheels front_wheel_drive 4787 rear_wheel_drive 3371 all_wheel_drive 2353 four_wheel_drive 1403 Name: count, dtype: int64 71 market_category crossover 1110 flex_fuel 872 luxury 855 luxury,performance 673 hatchback 641 Name: count, dtype: int64 3 vehicle_size compact 4764 midsize 4373 large 2777 Name: count, dtype: int64 16 vehicle_style sedan 3048 4dr_suv 2488 coupe 1211 convertible 793 4dr_hatchback 702 Name: count, dtype: int64
# creating a dictionary to store the categorical variables
# with their most popular values
categories = {}
for c in categorical_variables:
categories[c] = list(df[c].value_counts().head().index)
categories
{'make': ['chevrolet', 'ford', 'volkswagen', 'toyota', 'dodge'],
'engine_fuel_type': ['regular_unleaded',
'premium_unleaded_(required)',
'premium_unleaded_(recommended)',
'flex-fuel_(unleaded/e85)',
'diesel'],
'transmission_type': ['automatic',
'manual',
'automated_manual',
'direct_drive',
'unknown'],
'driven_wheels': ['front_wheel_drive',
'rear_wheel_drive',
'all_wheel_drive',
'four_wheel_drive'],
'market_category': ['crossover',
'flex_fuel',
'luxury',
'luxury,performance',
'hatchback'],
'vehicle_size': ['compact', 'midsize', 'large'],
'vehicle_style': ['sedan',
'4dr_suv',
'coupe',
'convertible',
'4dr_hatchback']}
# adjusting our feature matrix
def prepare_x(df):
    """Feature matrix: base numerics, age, door dummies, and one-hot columns
    for the most frequent values of every variable in the global `categories` dict."""
    df = df.copy()  # keep the caller's dataframe unmodified
    features = base.copy()
    df['age'] = df.year.max() - df.year
    features.append('age')
    # One binary indicator column per observed door count.
    for doors in [2, 3, 4]:
        col = 'num_doors_%s' % doors
        df[col] = (df['number_of_doors'] == doors).astype('int')
        features.append(col)
    # Adding categorical variables to the feature matrix: one binary column
    # per (categorical variable, popular value) pair.
    for cat, values in categories.items():
        for value in values:
            col = '%s_%s' % (cat, value)
            df[col] = (df[cat] == value).astype('int')
            features.append(col)
    return df[features].fillna(0).values
# training part
X_train = prepare_x(df_train)
w0, w = train_linear_regression(X_train, y_train)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
30.1564951732265
w0
6775859061080289.0
Regularization¶
XTX = [
[1, 2, 2],
[2, 1, 1],
[2, 1, 1]
] # we have a duplicate column
XTX = np.array(XTX)
XTX
array([[1, 2, 2],
[2, 1, 1],
[2, 1, 1]])
# XTX_inv = np.linalg.inv(XTX) # cannot find inverse as it is a singular matrix
# hence we modify by adding small number
XTX = [
[1, 2, 2],
[2, 1, 1.0000001],
[2, 1.00000001, 1]
]
XTX = np.array(XTX)
XTX
array([[1. , 2. , 2. ],
[2. , 1. , 1.0000001 ],
[2. , 1.00000001, 1. ]])
XTX_inv = np.linalg.inv(XTX)
XTX_inv
array([[-3.33333337e-01, 6.06060587e-02, 6.06060610e-01],
[ 6.06060608e-01, -9.09090913e+06, 9.09090882e+06],
[ 6.06060605e-02, 9.09090910e+06, -9.09090913e+06]])
# hence we modify by adding small number in diagonal
# the larger the number we add to the diagonal, the more the weights are kept under control
XTX = XTX + (0.01 * np.eye(3))
XTX
array([[2.12 , 2. , 2. ],
[2. , 2.12 , 1.0000001 ],
[2. , 1.00000001, 2.12 ]])
np.linalg.inv(XTX)
array([[-2.25173233, 1.44341819, 1.44341808],
[ 1.44341808, -0.31858302, -1.21144014],
[ 1.44341819, -1.21144027, -0.31858302]])
# adding regularization parameter
def train_linear_regression_reg(X, y, r=0.001):
    """Fit ridge-regularized linear regression via the normal equation.

    X: (n_samples, n_features) feature matrix, without a bias column.
    y: (n_samples,) target vector.
    r: regularization strength added to the diagonal of X^T X; a larger r
       shrinks the weights and guards against (near-)singular matrices.
    Returns (w0, w): the bias term and the feature weight vector.
    """
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    # Adding r on the diagonal keeps XTX well-conditioned even when columns
    # are duplicated or collinear.
    XTX = XTX + (r * np.eye(XTX.shape[0]))
    # Solve the regularized normal equations directly instead of forming the
    # explicit inverse: numerically more stable.
    w_full = np.linalg.solve(XTX, X.T.dot(y))
    return w_full[0], w_full[1:]
# training part
X_train = prepare_x(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.01)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.4565219901446315
Hence by adding a number to the diagonal we were able to control our weights, to regularize our model
Tuning the model¶
Finding the best regularization parameter for our linear regression model
# Grid-search the regularization strength r, scoring each candidate
# on the validation set; r=0.0 reproduces the unregularized blow-up.
for r in [0.0, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]:
    X_train = prepare_x(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    # validation
    X_val = prepare_x(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    print(r, w0, score)
    print()  # blank line between candidates for readability
0.0 6775859061080289.0 30.1564951732265 1e-05 6.590905413435901 0.4565170280575304 0.0001 6.29829381216216 0.4565170612411519 0.001 6.285877621486336 0.4565175087348855 0.01 6.276610129017163 0.4565219901446315 0.1 6.191208692817916 0.4565692763026095 1 5.634896667769638 0.45722043179983024 10 4.283980108950864 0.47014569321001515
r = 0.001
X_train = prepare_x(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=r)
# validation
X_val = prepare_x(df_val)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)
0.4565175087348855